Reading data ..
raw_df <- read.csv("processed_df_5_19.csv")
names(raw_df) <- str_replace_all(names(raw_df), c(" " = "_"))
raw_df$date = as.Date(raw_df$date)
dim(raw_df)## [1] 266296 11
## [1] 16123
Roll up the data at a day level and remove observations less than 3 days
day_level <- raw_df %>%
group_by(Dim_Patient, trimester, Observation_Source, date, week, days) %>%
summarise(wt_avg = mean(Observation_Value))
gtr_3 <- day_level %>%
group_by(Dim_Patient) %>%
count() %>% filter(n > 2)
day_level <- day_level %>% filter(Dim_Patient %in% gtr_3$Dim_Patient)
length(unique(day_level$Dim_Patient))## [1] 16071
## [1] 244150 7
day_level <-day_level %>% group_by(Dim_Patient) %>%
mutate( num_of_observ = n(),
# engagement = ifelse(num_of_observ >6, "good", "bad")
)
#write.csv(day_level, "day_level_5_19.csv")
g1 <- ggplot(gtr_3, aes(x=n)) + geom_histogram()
ggplotly(g1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Total number of rows and columns are 244150, 8 and total number of unique patients are 16071
Roll up the data at a week level and remove observations just 1 week
week_level<- raw_df %>% group_by(Dim_Patient, trimester, Observation_Source, week) %>% summarise(wt_avg = mean(Observation_Value))
gtr_1 <- week_level %>% group_by(Dim_Patient) %>% count() %>% filter(n>1)
week_level <- week_level %>% filter(Dim_Patient %in% gtr_1$Dim_Patient)
week_level <-week_level %>% group_by(Dim_Patient) %>%
mutate( num_of_observ = n(),
engagement = ifelse(num_of_observ >6, "good", "bad"))
length(unique(week_level$Dim_Patient))## [1] 16108
## [1] 186956 7
#write.csv(week_level, "week_level_5_19.csv")
# remove if weeks 2 and week diff less than 8
g1 <- ggplot(gtr_1, aes(x=n)) + geom_histogram()
ggplotly(g1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Merge good list with patient at a day level
good_list <- week_level %>% select(Dim_Patient, engagement) %>% distinct()
day_level <- merge(day_level, good_list, by= "Dim_Patient", all.x=T)Total number of rows and columns are 186956, 7 and total number of unique patients are 16108
Calculate wt gain by taking first entry of the patient and last week prior week 41
initial_wt <- day_level %>% group_by(Dim_Patient) %>%
filter(week<=40) %>%
arrange(Dim_Patient, date) %>% filter(row_number()==1 | row_number()==n()) %>%
mutate(n = n()) %>%
filter(n>1) %>%
arrange(Dim_Patient, date) %>%
summarise(weight_gain = diff(wt_avg))
initial_wt %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Merge Age, height and calculate BMI
ht <- read_excel('GWG(2+)_Corrected.5.10.20(lowcount).xlsx', sheet ='demographics')
names(ht) <- str_replace_all(names(ht), c(" " = "_"))
day_level_demo <- merge(day_level, ht[,c("Dim_Patient", "Activated_At",
"Date_of_Birth", "Age_at_Activation_(years)",
"Customer_Group", "Height_(m)")],
by= "Dim_Patient", all.x = TRUE)
day_level_demo$weight_kg <- day_level_demo$wt_avg/2.2046Calculate BMI
init_wt <-day_level_demo %>% group_by(Dim_Patient) %>% arrange(Dim_Patient,date) %>% filter(row_number() == 1)
# initial weight bmi
init_wt$bmi <- init_wt$weight_kg/(init_wt$`Height_(m)`^2)
day_level_demo <- merge(day_level_demo, init_wt[, c("Dim_Patient", "bmi")], by = "Dim_Patient", all.x = T)
day_level_demo$bmi_category <- ifelse(day_level_demo$bmi <18.5, "Underweight",
ifelse(day_level_demo$bmi<25, "Normal Weight",
ifelse(day_level_demo$bmi<30, "Overweight",
"Obese")))
# check patient having ht less than 0.5
lessht <- day_level_demo %>% filter(`Height_(m)`<0.6)
length(unique(lessht))## [1] 17
# weight gain = T1-T3
weight_gain <- day_level_demo %>%
group_by(Dim_Patient) %>%
arrange(Dim_Patient,date) %>%
filter((row_number()==1 & trimester== "T1") | (row_number()==n() & trimester== "T3")) %>%
mutate(n = n()) %>%
filter(n>1) %>% arrange(Dim_Patient, trimester) %>%
summarise(weight_gain = diff(wt_avg))
day_level_demo <- merge(day_level_demo,initial_wt, by= "Dim_Patient", all.x = T)
# create a categorical variable for bmi within and outside recommended range
day_level_demo$bmi_flag <- ifelse(is.na(day_level_demo$weight_gain),NA,
ifelse((day_level_demo$bmi_category=="Underweight") &
(day_level_demo$weight_gain < 40) &
(day_level_demo$weight_gain > 28), 1,
ifelse((day_level_demo$bmi_category=="Normal Weight") &
(day_level_demo$weight_gain > 25) &
(day_level_demo$weight_gain <35), 1,
ifelse((day_level_demo$bmi_category=="Overweight") &
(day_level_demo$weight_gain > 15) &
(day_level_demo$weight_gain <25),1,
ifelse((day_level_demo$bmi_category=="Obese") &
(day_level_demo$weight_gain > 11) &
(day_level_demo$weight_gain <20),1,0)))))
# day_level_demo %>% filter((day_level_demo$bmi_category=="Overweight") &
# (day_level_demo$weight_gain > 15) &
# (day_level_demo$weight_gain <25) & (Dim_Patient == "ref:Patient/Athena#13122|1966456"))
# calculate wt difference
day_level_demo <- day_level_demo %>%
group_by(Dim_Patient) %>%
arrange(Dim_Patient,date) %>%
mutate(wt_diff = c(0, diff(wt_avg)))
# cross tab
crosstab(day_level_demo, col.vars = "bmi_flag", row.vars = "bmi_category", type = "f")## bmi_flag 0 1 Sum
## bmi_category
## Normal Weight 85641 27686 113327
## Obese 44663 13689 58352
## Overweight 49049 18572 67621
## Underweight 2714 1009 3723
## Sum 182067 60956 243023
Merge week level data
week_level_demo <- merge(week_level, ht[,c("Dim_Patient", "Activated_At",
"Date_of_Birth", "Age_at_Activation_(years)",
"Customer_Group", "Height_(m)")],
by= "Dim_Patient", all.x = TRUE)
# weight gain
week_level_demo <- merge(week_level_demo, initial_wt, by= "Dim_Patient", all.x = T)
# BMI
week_level_demo <- merge(week_level_demo, init_wt[, c("Dim_Patient", "bmi")], by = "Dim_Patient", all.x = T)
week_level_demo$bmi_category <- ifelse(week_level_demo$bmi <18.5, "Underweight",
ifelse(week_level_demo$bmi<25, "Normal Weight",
ifelse(week_level_demo$bmi<30, "Overweight",
"Obese")))
# create a categorical variable for bmi within and outside recommended range
week_level_demo$bmi_flag <- ifelse(is.na(week_level_demo$weight_gain),NA,
ifelse((week_level_demo$bmi_category=="Underweight") &
(week_level_demo$weight_gain < 40) &
(week_level_demo$weight_gain > 28), 1,
ifelse((week_level_demo$bmi_category=="Normal Weight") &
(week_level_demo$weight_gain > 25) &
(week_level_demo$weight_gain <35), 1,
ifelse((week_level_demo$bmi_category=="Overweight") &
(week_level_demo$weight_gain > 15) &
(week_level_demo$weight_gain <25), 1,
ifelse((week_level_demo$bmi_category=="Obese") &
(week_level_demo$weight_gain > 11) &
(week_level_demo$weight_gain <20),1,0)))))
# calculate wt difference
week_level_demo <- week_level_demo %>%
group_by(Dim_Patient) %>%
arrange(Dim_Patient,week) %>%
mutate(wt_diff = c(0, diff(wt_avg)))
#week_level_demo %>% filter((week_level_demo$bmi_category=="Obese"))
#write.csv(week_level_demo, "week_level_demo_6_2.csv")Class counts - 0 is outside bmi range and 1 is within bmi range
sub_week <- week_level_demo %>%
filter(!is.na(bmi_flag)) %>%
filter(!is.na(bmi_category))
count_sub_week <- sub_week %>%
group_by(Dim_Patient, bmi_flag) %>%
count()
# check class bias
table(count_sub_week$bmi_flag)##
## 0 1
## 12527 3414
## bmi_flag 0 1 Sum
## bmi_category
## Normal Weight 62893 21440 84333
## Obese 35712 11118 46830
## Overweight 37261 14623 51884
## Underweight 2056 855 2911
## Sum 137922 48036 185958
Before performing t-test we need to verify if the following t-test assumptions are valid:
Assumption1: Are the two samples independents? Yes
Assumption2: Are the data from each of the 2 groups follow a normal distribution? shapiro.test() to compute Shapiro-Wilk test for each group of samples.
Can’t perform Shapiro test as sample size is large //Shapiro test fails, that means data is not normally distributed, but the sample size is large enough (n //> 30), so we can ignore the distribution of the data and use parametric tests.
Assumption3 - have same variance:
##
## F test to compare two variances
##
## data: n by bmi_flag
## F = 1.0349, num df = 12526, denom df = 3413, p-value = 0.2127
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.9805347 1.0913305
## sample estimates:
## ratio of variances
## 1.03488
The p-value of F-test is p = 0.2127. It’s greater than the significance level alpha = 0.05. In conclusion, there is no significant difference between the variances of the two sets of data. Therefore, we can use the classic t-test which assume equality of the two variances
#Wilcoxon rank-sum test non parametric test
# t test
ttest<- t.test(n~bmi_flag, data = count_sub_week)
ttest##
## Welch Two Sample t-test
##
## data: n by bmi_flag
## t = -20.161, df = 5490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.357894 -2.762747
## sample estimates:
## mean in group 0 mean in group 1
## 11.00998 14.07030
T test shows that the mean of the two samples is not same.
# 2: Are the data from each of the 2 groups follow a normal distribution? shapiro.test() to compute Shapiro-Wilk test for each group of samples.
# 3 Do the two populations have the same variances? F-test to test for homogeneity in variances. This can be performed with the function var.test()
#
# # Run ANOVA
# anova_wt_bmi <- aov(weight_gain~bmi_flag, data = sub_week)
# summary(anova_wt_bmi)
# anova_wt_bmiUse number of observations (as a proxy to engagement) to find if recommended weight gain will be within the range or outside
##
## Call:
## glm(formula = bmi_flag ~ n, family = binomial(link = "logit"),
## data = count_sub_week)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2555 -0.7016 -0.6105 -0.5745 1.9599
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.850540 0.035902 -51.54 <2e-16 ***
## n 0.044182 0.002278 19.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16560 on 15940 degrees of freedom
## Residual deviance: 16191 on 15939 degrees of freedom
## AIC: 16195
##
## Number of Fisher Scoring iterations: 4
Frequency distribution of observations
To make the results more interpretable, forming 3 classes of observations - low freq(0-6 weeks), moderate freq (6-15 weeks), high freq (15 onwards).
count_sub_week$obs_categories <- ifelse(count_sub_week$n <5,"Low freq",
ifelse(count_sub_week$n >=5 & count_sub_week$n <=15,"Moderate freq",
"High freq"))
table(count_sub_week$obs_categories)##
## High freq Low freq Moderate freq
## 4434 3279 8228
##
## 0 1
## 12527 3414
## obs_categories High freq Low freq Moderate freq Sum
## bmi_flag
## 0 3119 2962 6446 12527
## 1 1315 317 1782 3414
## Sum 4434 3279 8228 15941
Now perform logistic regression with the observation classes.
logit <- glm(bmi_flag~obs_categories, data=count_sub_week, family=binomial(link="logit"))
summary(logit)##
## Call:
## glm(formula = bmi_flag ~ obs_categories, family = binomial(link = "logit"),
## data = count_sub_week)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8388 -0.6987 -0.6987 -0.4509 2.1617
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.86368 0.03288 -26.268 <2e-16 ***
## obs_categoriesLow freq -1.37104 0.06762 -20.276 <2e-16 ***
## obs_categoriesModerate freq -0.42205 0.04240 -9.955 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16560 on 15940 degrees of freedom
## Residual deviance: 16074 on 15938 degrees of freedom
## AIC: 16080
##
## Number of Fisher Scoring iterations: 4
having low freq(less engagement), versus high freq(more engagement), changes the log odds of weight gain within the recommended range by -1.37104 similarly it changes by -0.42205 in case of moderate freq
In other words, log of odds ratio to be within bmi range decreases by 1.37 when frequency counts changes from high class to low class.
Odds ratio:
## (Intercept) obs_categoriesLow freq
## 0.4216095 0.2538422
## obs_categoriesModerate freq
## 0.6557028
To make the results more interpretable, forming 3 classes of observations - low freq(0-4 weeks), moderate freq (4-10 weeks), high freq (10 onwards).
# By practice group
q1_groups <- day_level_demo %>% select(Customer_Group, Dim_Patient) %>% unique() %>%
group_by(Customer_Group) %>% count() %>% arrange(n)
q1_groups$Customer_Group <- factor(q1_groups$Customer_Group, levels = unique(q1_groups$Customer_Group))
q1_groups %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))#plot_ly(q1_groups, y = ~q1_groups$Customer_Group, x = ~q1_groups$n, type = 'bar', orientation = 'h')
plt1 <- ggplot(q1_groups, aes(x =Customer_Group, y =n)) +
geom_bar(stat = "identity", fill = 'blue') + #theme(axis.text.x = element_text(angle = 90)) +
theme(axis.title.x=element_blank(), axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
ggtitle("Total patients across patient groups") +
ylab("Count of Patients") +
xlab("Patient groups")
ggplotly(plt1)# Age bracket
age <- day_level_demo %>% filter((`Age_at_Activation_(years)`>14) &
(`Age_at_Activation_(years)`<60))
length(unique(day_level_demo$Dim_Patient)) - length(unique(age$Dim_Patient))## [1] 65
age$age_category <- ifelse(age$`Age_at_Activation_(years)`<20, "14-19",
ifelse(age$`Age_at_Activation_(years)`<25, "20-24",
ifelse(age$`Age_at_Activation_(years)`<30, "25-29",
ifelse(age$`Age_at_Activation_(years)`<35,"30-34",
ifelse(age$`Age_at_Activation_(years)`<40,"35-39",
ifelse(age$`Age_at_Activation_(years)`<45,"40-44",
ifelse(age$`Age_at_Activation_(years)`<50,"45-49",
ifelse(age$`Age_at_Activation_(years)`<55,"50-54", "55-60"))))))))
#age$age_category <- as.numeric(cut(age$`Age_at_Activation_(years)`, 9))
q1_age <- age %>% select(Dim_Patient, age_category) %>% unique() %>% group_by(age_category) %>% count()
q1_age %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))bmi <- day_level_demo %>% filter((!is.na(bmi_category)) & (`Height_(m)` >0.6))
q1_bmi <- bmi %>% select(Dim_Patient, bmi_category) %>% unique() %>%group_by(bmi_category) %>% count()
q1_bmi %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))plt1 <- ggplot(q1_bmi, aes(x =bmi_category, y =n)) +
geom_bar(stat = "identity", fill='darkblue') +
ggtitle("Total patients across different BMI groups") +
ylab("Count of Patients") +
xlab("BMI Categories") + scale_x_discrete(limits=c("Underweight", "Normal Weight", "Overweight", "Obese"))
ggplotly(plt1) - Entire population
- Practice group
- Age bracket (5 years)
- BMI category (less than 25 ; 25.1 - 29.9 ; 30-34.9 ; 35.0-39.9 ; 40-59.9 ; 60 -over)
Entire Population
q2_whole <- day_level_demo %>%
group_by(Dim_Patient) %>%
summarize(
total_obser = n(),
start_week = min(week),
start_date = min(date),
end_date = max(date),
length = end_date - start_date
)
q2_whole %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Practice groups
q2_prac <- day_level_demo %>%
group_by(Customer_Group, Dim_Patient) %>%
summarise(
count = n(),
start_week = min(week),
start_date = min(date),
end_date = max(date),
length = end_date - start_date
) %>%
ungroup() %>%
group_by(Customer_Group) %>%
summarise(
avg_count = mean(count),
std_count = sd(count),
mean_week = mean(start_week),
avg_length = mean(length),
unique_patients = n_distinct(Dim_Patient)
)
q2_prac %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Age categories
q2_age <- age %>% group_by(age_category, Dim_Patient) %>%
summarise(count = n(), start_week = min(week),
start_date = min(date), end_date = max(date), length = end_date - start_date) %>%
ungroup() %>%
group_by(age_category) %>%
summarise(avg_count = mean(count),
std_count = sd(count),
mean_week = mean(start_week),
avg_length = mean(length),
unique_patients = n_distinct(Dim_Patient))
q2_age %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))BMI table
q2_bmi <- bmi %>%
group_by(bmi_category, Dim_Patient) %>%
summarise(count = n(),
start_week = min(week),
start_date = min(date), end_date = max(date), length = end_date - start_date) %>%
ungroup() %>%
group_by(bmi_category) %>%
summarise(avg_count = mean(count),
std_count = sd(count),
mean_week = mean(start_week),
avg_length = mean(length),
unique_patients = n_distinct(Dim_Patient))
q2_bmi %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))--same as previous
# do this for week
q3_whole <- day_level_demo %>% group_by(Dim_Patient) %>% summarise(start_week = min(week)) %>%
group_by(start_week) %>% count()
plt1 <- ggplot(q3_whole, aes(x =start_week, y =n)) +
geom_bar(stat = "identity", fill='darkblue') +
ggtitle("Start week of all patients") +
ylab("Count of Patients") +
xlab("Start week")
ggplotly(plt1)q3_prac <- day_level_demo %>% group_by(Customer_Group, Dim_Patient) %>%
summarise(start_week = min(week)) %>% ungroup() %>% group_by(Customer_Group) %>%
summarise(mean_week = mean(start_week))
q3_age <- age %>% group_by(age_category, Dim_Patient) %>%
summarise(start_week = min(week)) %>% ungroup() %>% group_by(age_category) %>%
summarise(mean_week = mean.Date(start_week))
q3_bmi <- bmi %>% group_by(bmi_category, Dim_Patient) %>%
summarise(start_week = min(week)) %>% ungroup() %>% group_by(bmi_category) %>%
summarise(mean_week = mean.Date(start_week))-- same as previous
q4_whole <- day_level_demo %>% group_by(Dim_Patient) %>% summarise(start_date = min(date), end_date = max(date), length = end_date - start_date)
# q4_whole$length2 = as.numeric(q4_whole$length)
# g1 <- ggplot(q4_whole, aes(x=length2)) +geom_histogram()
# ht1 <- hist(q4_whole$length2)
# ggplotly(g1)
q4_prac <- day_level_demo %>% group_by(Customer_Group, Dim_Patient) %>%
summarise(start_date = min(date), end_date = max(date), length = end_date - start_date) %>%
ungroup() %>% group_by(Customer_Group) %>%
summarise(avg_length = mean(length)) %>% arrange(avg_length)
q4_prac$Customer_Group <- factor(q4_prac$Customer_Group, levels = unique(q4_prac$Customer_Group))
plt1 <- ggplot(q4_prac, aes(x =Customer_Group, y =avg_length)) +
geom_bar(stat = "identity", fill = 'blue') + #theme(axis.text.x = element_text(angle = 90)) +
theme(axis.title.x=element_blank(), axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
ggtitle("Total patients across patient groups") +
ylab("Count of Patients") +
xlab("Patient groups")
ggplotly(plt1)## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
q4_age <- age %>% group_by(age_category, Dim_Patient) %>%
summarise(start_date = min(date), end_date = max(date), length = end_date - start_date) %>%
ungroup() %>% group_by(age_category) %>%
summarise(mean_date = mean(length))
q4_bmi <- bmi %>% group_by(bmi_category, Dim_Patient) %>%
summarise(start_date = min(date), end_date = max(date), length = end_date - start_date) %>%
ungroup() %>% group_by(bmi_category) %>%
summarise(mean_date = mean(length)) - Whole group
- Practice group
- Age
- BMI group
Whole group
q5_whole <- day_level_demo %>% group_by(Dim_Patient, trimester) %>%
arrange(Dim_Patient, date) %>% filter(row_number()==1 | row_number()==n()) %>%
mutate(n = n()) %>%
filter(n>1) %>%
summarise(weight_diff = diff(wt_avg))
q5_whole %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Practice group
q5_prac <- day_level_demo %>% group_by(Customer_Group, Dim_Patient, trimester) %>%
arrange(Customer_Group, Dim_Patient, date) %>% filter(row_number()==1 | row_number()==n()) %>%
mutate(n = n()) %>%
filter(n>1) %>%
summarise(weight_diff = diff(wt_avg)) %>% ungroup() %>%
group_by(Customer_Group, trimester) %>% summarise(avg_weight_diff = mean(weight_diff),
std_wt_diff = sd(weight_diff))
q5_prac %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Age
q5_age <- age %>% group_by(age_category, Dim_Patient, trimester) %>%
arrange(Dim_Patient, date) %>% filter(row_number()==1 | row_number()==n()) %>%
mutate(n = n()) %>%
filter(n>1) %>%
summarise(weight_diff = diff(wt_avg)) %>% ungroup() %>%
group_by(age_category, trimester) %>% summarise(avg_weight_diff = mean(weight_diff),
std_wt_diff = sd(weight_diff))
q5_age %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))q5_bmi <- bmi %>% group_by(bmi_category, Dim_Patient, trimester) %>%
arrange(Dim_Patient, date) %>% filter(row_number()==1 | row_number()==n()) %>%
mutate(n = n()) %>%
filter(n>1) %>%
summarise(weight_diff = diff(wt_avg),
week_diff = diff(week),
range = ifelse(is.infinite(abs(weight_diff/week_diff)), 1, weight_diff/week_diff)) %>%
ungroup() %>%
group_by(bmi_category, trimester) %>%
summarise(avg_weight_diff = mean(weight_diff),
std_wt_diff = sd(weight_diff),
mean_range = mean(range, na.rm = T))The values do not reside in the recommended range of the reference table
q5_bmi %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Anova in T2 to change if the bmi groups have different means
q5 <- bmi %>% filter(trimester %in% c("T2", "T3")) %>%
group_by(bmi_category, Dim_Patient, trimester) %>%
arrange(Dim_Patient, date) %>% filter(row_number()==1 | row_number()==n()) %>%
mutate(n = n()) %>%
filter(n>1) %>%
summarise(`Age_at_Activation_(years)` = min(`Age_at_Activation_(years)`),
weight_diff = diff(wt_avg),
week_diff = diff(week),
rate_of_wtgain = ifelse(is.infinite(abs(weight_diff/week_diff)), 1, weight_diff/week_diff)) %>% filter(!is.na(rate_of_wtgain))
count <- bmi %>% group_by(Dim_Patient, trimester) %>% count()
q5 <- merge(q5, count, by = c("Dim_Patient", "trimester"), all.x = T)Frequency distribution of rate of weight gain in T2 and T3
Perform anova to test if the bmi groups have different means for rate of weight gain
## Anova Table (Type II tests)
##
## Response: rate_of_wtgain
## Sum Sq Df F value Pr(>F)
## bmi_category 394.7 3 184.33 < 2.2e-16 ***
## Residuals 9436.3 13219
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova test shows that there is a significat diff within the bmi groups when rate of weight is used as a response variable. The same is observed in case of T3
## Anova Table (Type II tests)
##
## Response: rate_of_wtgain
## Sum Sq Df F value Pr(>F)
## bmi_category 57.2 3 9.7097 2.149e-06 ***
## Residuals 18514.8 9431
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Following table shows the interaction between bmi category and trimester on the rate of weight gain has a significant effect. As well as, bmi category and trimester variables have a significant impact on the rate of weight gain.
## Anova Table (Type II tests)
##
## Response: rate_of_wtgain
## Sum Sq Df F value Pr(>F)
## bmi_category 443.3 3 120.559 < 2.2e-16 ***
## trimester 172.6 1 140.787 < 2.2e-16 ***
## n 190.4 1 155.352 < 2.2e-16 ***
## bmi_category:trimester 59.0 3 16.055 2.014e-10 ***
## Residuals 27760.8 22649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Lets build a linear regression model for rate of wt gain as a response variable and bmi category, trimester, age as predictors.
model <- lm(rate_of_wtgain ~ bmi_category + trimester + n + `Age_at_Activation_(years)`, data=q5)
summary(model)##
## Call:
## lm(formula = rate_of_wtgain ~ bmi_category + trimester + n +
## `Age_at_Activation_(years)`, data = q5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.623 -0.382 0.025 0.429 49.899
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.380249 0.034994 39.442 < 2e-16 ***
## bmi_categoryObese -0.336409 0.018109 -18.577 < 2e-16 ***
## bmi_categoryOverweight -0.083970 0.017851 -4.704 2.57e-06 ***
## bmi_categoryUnderweight 0.069644 0.060369 1.154 0.249
## trimesterT3 -0.176783 0.014936 -11.836 < 2e-16 ***
## n -0.010879 0.000907 -11.994 < 2e-16 ***
## `Age_at_Activation_(years)` -0.004743 0.001040 -4.560 5.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.108 on 22651 degrees of freedom
## Multiple R-squared: 0.02706, Adjusted R-squared: 0.0268
## F-statistic: 105 on 6 and 22651 DF, p-value: < 2.2e-16
Model shows that bmi categories, trimester, number of observations and age are statistically significant. The interaction between bmi category and trimester is not significant.
The R square value shows how much the model explains the variability around the mean in the response variable. In this case, the R sqaure low ~2.9% Hence the model with the variables bmi categories, trimester, number of observations and age does not fit well with the data.
To understand what does it mean when only category is a significant variable and what to do in that case refer: https://www.listendata.com/2016/07/insignificant-levels-of-categorical-variable.html
Coef can be interpreted as, if the bmi group changes from normal weight to over-weight group the rate of wt gain changes by -0.07 Logsitic regression to predict bmi groups within the recommended rate of weight gain:
# create a categorical variable for bmi within and outside recommended range
q5$bmi_flag <- ifelse(is.na(q5$rate_of_wtgain),NA,
ifelse((q5$bmi_category=="Underweight") &
(q5$rate_of_wtgain >=1) &
(q5$rate_of_wtgain <=1.3), 1,
ifelse((q5$bmi_category=="Normal Weight") &
(q5$rate_of_wtgain >=0.8) &
(q5$rate_of_wtgain <=1), 1,
ifelse((q5$bmi_category=="Overweight") &
(q5$rate_of_wtgain >=0.5) &
(q5$rate_of_wtgain <=0.7), 1,
ifelse((q5$bmi_category=="Obese") &
(q5$rate_of_wtgain >=0.4) &
(q5$rate_of_wtgain <=0.6), 1,0)))))
table(q5$bmi_flag)##
## 0 1
## 19617 3041
## bmi_category Normal Weight Obese Overweight Underweight Sum
## bmi_flag
## 0 7996 5548 5825 248 19617
## 1 1759 618 563 101 3041
## Sum 9755 6166 6388 349 22658
q5$obs_categories <- ifelse(q5$n <=6,"Low freq",
ifelse(q5$n > 6 & count_sub_week$n <=15,"Moderate freq",
"High freq"))
table(q5$obs_categories)##
## High freq Low freq Moderate freq
## 3424 11220 8014
## obs_categories High freq Low freq Moderate freq Sum
## bmi_flag
## 0 2960 9800 6857 19617
## 1 464 1420 1157 3041
## Sum 3424 11220 8014 22658
## Anova Table (Type II tests)
##
## Response: rate_of_wtgain
## Sum Sq Df F value Pr(>F)
## obs_categories 70.5 2 28.031 6.94e-13 ***
## Residuals 28496.7 22655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova between rate of wt gain and observation categories show that all the categories have different mean values.
Logistic regression between bmi flag and observation categories show that the observation categories are not the significant variables for the classification model (does not have impact when it predict of the classes)
The coef can be intrepreted as a patient low freq(less engagement), versus high freq(more engagement), changes the log odds of rate of weight gain within the recommended range by -0.07 similarly it changes by 0.07 in case of moderate freq.
##
## Call:
## glm(formula = bmi_flag ~ obs_categories, family = binomial(link = "logit"),
## data = q5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5584 -0.5584 -0.5202 -0.5202 2.0332
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.85306 0.04993 -37.113 <2e-16 ***
## obs_categoriesLow freq -0.07867 0.05744 -1.370 0.171
## obs_categoriesModerate freq 0.07362 0.05919 1.244 0.214
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17869 on 22657 degrees of freedom
## Residual deviance: 17856 on 22655 degrees of freedom
## AIC: 17862
##
## Number of Fisher Scoring iterations: 4
by:
- same as above
q6_whole <- q5_whole %>% filter(trimester == "T3")
g1 <- ggplot(q6_whole, aes(x=weight_diff)) + geom_histogram()
ggplotly(g1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
q6_whole %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Practice groups
q6_prac <- q5_prac %>% filter(trimester == "T3")
q6_prac %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Age
q6_age <- q5_age %>% filter(trimester == "T3")
q6_age %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))BMI
- same as above
Whole population
q7_whole <- day_level_demo %>% filter(trimester %in% c("T3", "PP")) %>%
group_by(Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter(row_number()==n()) %>% ungroup() %>%
group_by(Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>%
summarise(weight_diff = diff(wt_avg))
g1 <- ggplot(q7_whole, aes(x=weight_diff)) + geom_histogram()
ggplotly(g1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
q7_whole %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Practice groups
q7_prac <- day_level_demo %>% filter(trimester %in% c("T3", "PP")) %>%
group_by(Customer_Group, Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter(row_number()==n()) %>% ungroup() %>%
group_by(Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>%
mutate(weight_diff = diff(wt_avg)) %>% filter(row_number()==n()) %>% ungroup() %>%
group_by(Customer_Group) %>%
summarise(pt_count = n_distinct(Dim_Patient), avg_weight_diff = mean(weight_diff),
std_wt_diff = sd(weight_diff))
g1 <- ggplot(q7_prac, aes(x=avg_weight_diff)) + geom_histogram()
ggplotly(g1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
q7_prac %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Age
q7_age <- age %>% filter(trimester %in% c("T3", "PP")) %>%
group_by(age_category, Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter(row_number()==n()) %>% ungroup() %>%
group_by(Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>%
mutate(weight_diff = diff(wt_avg)) %>% filter(row_number()==n()) %>% ungroup() %>%
group_by(age_category) %>%
summarise(pt_count = n_distinct(Dim_Patient), avg_weight_diff = mean(weight_diff),
std_wt_diff = sd(weight_diff))
q7_age %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))BMI
q7_bmi <- bmi %>% filter(trimester %in% c("T3", "PP")) %>%
group_by(Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter(row_number()==n()) %>% ungroup() %>%
group_by(Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>%
mutate(weight_diff = diff(wt_avg)) %>% filter(row_number()==n()) %>% ungroup() %>%
group_by(bmi_category) %>%
summarise(pt_count = n_distinct(Dim_Patient), avg_weight_diff = mean(weight_diff),
std_wt_diff = sd(weight_diff))
q7_bmi %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Null hypothesis is that the mean of the weight differences is same for all groups
age <- age %>% group_by(Dim_Patient) %>% arrange(Dim_Patient, date) %>%
mutate(weight_diff = c(0,diff(wt_avg)))
# Run ANOVA
anova_wt_age <- aov(weight_diff ~ age_category, data = age)
summary(anova_wt_age)## Df Sum Sq Mean Sq F value Pr(>F)
## age_category 6 2143 357.1 22.96 <2e-16 ***
## Residuals 241799 3760732 15.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Call:
## aov(formula = weight_diff ~ age_category, data = age)
##
## Terms:
## age_category Residuals
## Sum of Squares 2143 3760732
## Deg. of Freedom 6 241799
##
## Residual standard error: 3.943746
## Estimated effects may be unbalanced
As the p value is low we reject the null hypothesis, which represents the groups are different from each other. Further we us tukeyhsd to check which groups are statistically different from each other
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = weight_diff ~ age_category, data = age)
##
## $age_category
## diff lwr upr p adj
## 20-24-14-19 -0.19530016 -0.40161002 0.01100969 0.0773966
## 25-29-14-19 -0.38750736 -0.58352133 -0.19149340 0.0000001
## 30-34-14-19 -0.46883161 -0.66360666 -0.27405656 0.0000000
## 35-39-14-19 -0.42692858 -0.62701406 -0.22684310 0.0000000
## 40-44-14-19 -0.32098710 -0.58311919 -0.05885501 0.0056680
## 45-49-14-19 -0.78528212 -1.33981373 -0.23075052 0.0005919
## 25-29-20-24 -0.19220720 -0.28078745 -0.10362695 0.0000000
## 30-34-20-24 -0.27353145 -0.35933532 -0.18772758 0.0000000
## 35-39-20-24 -0.23162841 -0.32888675 -0.13437008 0.0000000
## 40-44-20-24 -0.12568694 -0.32097702 0.06960314 0.4818916
## 45-49-20-24 -0.58998196 -1.11622362 -0.06374030 0.0165013
## 30-34-25-29 -0.08132425 -0.13806951 -0.02457899 0.0004761
## 35-39-25-29 -0.03942121 -0.11233831 0.03349589 0.6863040
## 40-44-25-29 0.06652026 -0.11785961 0.25090013 0.9386147
## 45-49-25-29 -0.39777476 -0.92006587 0.12451635 0.2711895
## 35-39-30-34 0.04190304 -0.02761493 0.11142100 0.5636067
## 40-44-30-34 0.14784451 -0.03521772 0.33090674 0.2064748
## 45-49-30-34 -0.31645051 -0.83827792 0.20537690 0.5562369
## 40-44-35-39 0.10594147 -0.08276110 0.29464404 0.6460787
## 45-49-35-39 -0.35835355 -0.88218627 0.16547918 0.4037367
## 45-49-40-44 -0.46429502 -1.01482182 0.08623178 0.1640505
age group 14-19 and 20-24 are statiscally different from other groups
Similarlly test differences in bmi categories:
bmi <- bmi %>% group_by(Dim_Patient) %>% arrange(Dim_Patient, date) %>%
mutate(weight_diff = c(0,diff(wt_avg)))
# Run ANOVA
anova_wt_bmi <- aov(weight_diff ~ bmi_category, data = bmi)
summary(anova_wt_bmi)## Df Sum Sq Mean Sq F value Pr(>F)
## bmi_category 3 2948 982.6 63.2 <2e-16 ***
## Residuals 242869 3776042 15.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Call:
## aov(formula = weight_diff ~ bmi_category, data = bmi)
##
## Terms:
## bmi_category Residuals
## Sum of Squares 2948 3776042
## Deg. of Freedom 3 242869
##
## Residual standard error: 3.943051
## Estimated effects may be unbalanced
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = weight_diff ~ bmi_category, data = bmi)
##
## $bmi_category
## diff lwr upr p adj
## Obese-Normal Weight -0.24902295 -0.30069790 -0.19734799 0.0000000
## Overweight-Normal Weight -0.02775685 -0.07696784 0.02145415 0.4686745
## Underweight-Normal Weight 0.26555931 0.09683658 0.43428205 0.0003071
## Overweight-Obese 0.22126610 0.16398355 0.27854865 0.0000000
## Underweight-Obese 0.51458226 0.34333119 0.68583333 0.0000000
## Underweight-Overweight 0.29331616 0.12279241 0.46383992 0.0000586
Apart from overweight and normal weight, all groups are statistically different.
- Perhaps here we can interpolate for PP (only 6 weeks)????
Weight Gain:
wt_gain <- day_level_demo %>% filter(trimester %in% c("T1", "T3")) %>%
group_by(Customer_Group, Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter( (row_number()==1 & trimester== "T1") | (row_number()==n() & trimester== "T3")) %>%
ungroup() %>%
group_by(Customer_Group, bmi_category, Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>% arrange(Customer_Group, Dim_Patient, trimester) %>%
summarise(weight_gain = diff(wt_avg))
wt_gain_groups <- wt_gain %>% group_by(Customer_Group) %>% summarise(avg_gain = mean(weight_gain))
wt_gain_groups %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))matrix_2by2 <- function(bmi_cat, lr, ur) {
# over BMi, good engagement
og <- wt_gain %>% filter((bmi_category==bmi_cat) &
(engagement == "good") &
( weight_gain > ur | weight_gain < lr)) %>% select(Dim_Patient) %>% distinct()
# within BMi, good engagement
wg <- wt_gain %>% filter((bmi_category==bmi_cat) &
(engagement == "good") &
(weight_gain <=ur | weight_gain>= lr)) %>% select(Dim_Patient) %>% distinct()
# over BMi, bad engagement
ob <- wt_gain %>% filter((bmi_category==bmi_cat) &
(engagement == "bad") &
( weight_gain > ur | weight_gain < lr)) %>% select(Dim_Patient) %>% distinct()
# within BMi, bad engagement
wb <- wt_gain %>% filter((bmi_category==bmi_cat) &
(engagement == "bad") &
(weight_gain <= ur | weight_gain>= lr)) %>% select(Dim_Patient) %>% distinct()
MI <- matrix(c(dim(ob)[1], dim(wb)[1], dim(og)[1], dim(wg)[1]), nrow = 2)
dimnames(MI) <- list("BMI" = c("bmi_over","bmi_within"), "Engagemnt" = c("bad", "good"))
MI
return(MI)
}BMI underweight
## Engagemnt
## BMI bad good
## bmi_over 8 54
## bmi_within 10 90
# proportion table
prop.out <- prop.table(MI, margin = 1)
#the null hypothesis that the proportions in both groups are the same. In addition it will provide a confidence interval for the difference in proportions.
p.out <-prop.test(MI)
print("difference in proportions")## [1] "difference in proportions"
## prop 1
## 0.02903226
## [1] "For patients outside recommended range of BMi, we expect to observe 7 patients with bad engagement for every 55 patients with good engagement "
## [1] 4/27
## [1] "Likewise in the within recommended range og BMI group, we expect to observe 2 patients with bad engagement for every 23 patients with good engagement"
## [1] 1/9
# Percent increase = (Risk Ratio lower bound – 1) x 100
# Percent decrease = (1 – Risk Ratio upper bound) x 100
# Take odds ratio
or.out <- oddsratio(MI, rev="b")
or.out$measure## odds ratio with 95% C.I.
## BMI estimate lower upper
## bmi_within 1.000000 NA NA
## bmi_over 1.335387 0.4754595 3.639486
Odds of bad engagement are at least 47% higher for the outside goodommended weight gain group.
BMI normal weight
## Engagemnt
## BMI bad good
## bmi_over 175 1583
## bmi_within 253 2461
# proportion table
prop.out <- prop.table(MI, margin = 1)
#the null hypothesis that the proportions in both groups are the same. In addition it will provide a confidence interval for the difference in proportions.
p.out <-prop.test(MI)
p.out##
## 2-sample test for equality of proportions with continuity correction
##
## data: MI
## X-squared = 0.42274, df = 1, p-value = 0.5156
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.0119067 0.0245559
## sample estimates:
## prop 1 prop 2
## 0.09954494 0.09322034
## [1] "difference in proportions"
## prop 1
## 0.006324598
## [1] "For patients outside recommended range of BMi, we expect to observe 121 patients with bad engagement for every 1637 patients with good engagement "
## [1] 175/1583
## [1] "Likewise in the within recommended range of BMI group, we expect to observe 90 patients with bad engagement for every 1267 patients with good engagement"
## [1] 11/107
# Percent increase = (Risk Ratio lower bound – 1) x 100
# Percent decrease = (1 – Risk Ratio upper bound) x 100
# Take odds ratio
or.out <- oddsratio(MI, rev="b")
or.out$measure## odds ratio with 95% C.I.
## BMI estimate lower upper
## bmi_within 1.000000 NA NA
## bmi_over 1.075609 0.877076 1.316466
Odds of bad engagement are at least 81% higher for the outside goodommended weight gain group.
BMI over weight
## Engagemnt
## BMI bad good
## bmi_over 97 899
## bmi_within 161 1340
# proportion table
prop.out <- prop.table(MI, margin = 1)
#the null hypothesis that the proportions in both groups are the same. In addition it will provide a confidence interval for the difference in proportions.
p.out <-prop.test(MI)
p.out##
## 2-sample test for equality of proportions with continuity correction
##
## data: MI
## X-squared = 0.52777, df = 1, p-value = 0.4675
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.03487567 0.01513114
## sample estimates:
## prop 1 prop 2
## 0.09738956 0.10726183
## [1] "difference in proportions"
## prop 1
## -0.009872267
## [1] 97/899
## [1] 161/1340
# Percent increase = (Risk Ratio lower bound – 1) x 100
# Percent decrease = (1 – Risk Ratio upper bound) x 100
# Take odds ratio
or.out <- oddsratio(MI, rev="b")
or.out$measure## odds ratio with 95% C.I.
## BMI estimate lower upper
## bmi_within 1.0000000 NA NA
## bmi_over 0.8986128 0.6869524 1.17001
Odds of bad engagement are at least 81% higher for the outside goodommended weight gain group.
BMI Obese
## Engagemnt
## BMI bad good
## bmi_over 125 843
## bmi_within 190 1189
# proportion table
prop.out <- prop.table(MI, margin = 1)
#the null hypothesis that the proportions in both groups are the same. In addition it will provide a confidence interval for the difference in proportions.
p.out <-prop.test(MI)
p.out##
## 2-sample test for equality of proportions with continuity correction
##
## data: MI
## X-squared = 0.29548, df = 1, p-value = 0.5867
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.03740641 0.02010887
## sample estimates:
## prop 1 prop 2
## 0.1291322 0.1377810
## [1] "difference in proportions"
## prop 1
## -0.008648769
## [1] 125/843
## [1] 190/1189
# Percent increase = (Risk Ratio lower bound – 1) x 100
# Percent decrease = (1 – Risk Ratio upper bound) x 100
# Take odds ratio
or.out <- oddsratio(MI, rev="b")
or.out$measure## odds ratio with 95% C.I.
## BMI estimate lower upper
## bmi_within 1.0000000 NA NA
## bmi_over 0.9283116 0.7271287 1.181602
Weight loss:
wt_loss <- day_level_demo %>% filter(trimester %in% c("T3", "PP")) %>%
group_by(Customer_Group, Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter( (row_number()==n() & trimester== "T3") | (row_number()==n() & trimester== "PP")) %>%
ungroup() %>%
group_by(Customer_Group, Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>% arrange(Customer_Group, Dim_Patient, desc(trimester)) %>%
summarise(weight_loss = diff(wt_avg))
wt_loss_groups <- wt_loss %>% group_by(Customer_Group) %>% summarise(avg_loss = mean(weight_loss))
wt_loss_groups %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Age categories weight gain
age_wt_gain <- age %>% filter(trimester %in% c("T1", "T3")) %>%
group_by(age_category, Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter( (row_number()==1 & trimester== "T1") | (row_number()==n() & trimester== "T3")) %>%
ungroup() %>%
group_by(age_category, Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>%
arrange(age_category, Dim_Patient, trimester) %>%
summarise(weight_gain = diff(wt_avg)) %>%
group_by(age_category) %>%
summarise(avg_weight_gain = mean(weight_gain))
age_wt_gain %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))Age wt loss
age_wt_loss <- age %>% filter(trimester %in% c("T3", "PP")) %>%
group_by(age_category, Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter( (row_number()==n() & trimester== "T3") | (row_number()==n() & trimester== "PP")) %>%
ungroup() %>%
group_by(age_category, Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>% arrange(age_category, Dim_Patient, desc(trimester)) %>%
summarise(weight_loss = diff(wt_avg)) %>%
group_by(age_category) %>%
summarise(avg_weight_ = mean(weight_loss))
age_wt_loss %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))BMI Cateogries
Wt gain
bmi_wt_gain <- bmi %>% filter(trimester %in% c("T1", "T3")) %>%
group_by(bmi_category, Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter( (row_number()==1 & trimester== "T1") | (row_number()==n() & trimester== "T3")) %>%
ungroup() %>%
group_by(bmi_category, Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>%
arrange(bmi_category, Dim_Patient, trimester) %>%
summarise(weight_gain = diff(wt_avg)* (-1)) %>%
group_by(bmi_category) %>%
summarise(avg_weight_gain = mean(weight_gain), std_wt_gain = sd(weight_gain))
bmi_wt_gain %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))BMI wt loss
bmi_wt_loss <- bmi %>% filter(trimester %in% c("T3", "PP")) %>%
group_by(bmi_category, Dim_Patient, trimester) %>%
arrange(Dim_Patient,date) %>%
filter( (row_number()==n() & trimester== "T3") | (row_number()==n() & trimester== "PP")) %>%
ungroup() %>%
group_by(bmi_category, Dim_Patient) %>%
mutate(n = n()) %>%
filter(n>1) %>% arrange(bmi_category, Dim_Patient, desc(trimester)) %>%
summarise(weight_loss = diff(wt_avg)) %>%
group_by(bmi_category) %>%
summarise(avg_weight_ = mean(weight_loss))
bmi_wt_loss %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))BMI and wt change is not correlated
bmi <- bmi %>% group_by(Dim_Patient) %>% arrange(Dim_Patient, date) %>%
mutate(weight_diff = c(0,diff(wt_avg)))
cor(bmi$bmi, bmi$weight_diff)## [1] -0.02995791
## `geom_smooth()` using formula 'y ~ x'
Age and wt is not correlated
age <- age %>% group_by(Dim_Patient) %>% arrange(Dim_Patient, date) %>%
mutate(weight_diff = c(0,diff(wt_avg)))
cor(age$`Age_at_Activation_(years)`, age$weight_diff)## [1] -0.01817134
## `geom_smooth()` using formula 'y ~ x'